knitr::opts_chunk$set(eval = FALSE, warning = FALSE, message = FALSE)
oldpath <- .libPaths()[1]
newpath <- "/home/biostacs/R/backup/env_31723062"
library(ggplot2)
library(Seurat, lib.loc = newpath)

导入第二部分的结果

if (!exists("seu.CCA")) {
  seu.CCA <- readRDS("tmp/s2-seu_CCA.rds")
}

seu.CCA <- RunTSNE(seu.CCA, reduction.use = "cca.aligned", dims.use = 1:20)

查看tSNE降维图,检查批次效应矫正的效果

tsne.emb <- GetDimReduction(seu.CCA, reduction.type = "tsne", slot = "cell.embeddings")
tsne.emb <- as.data.frame(tsne.emb)
tsne.emb$sample_id <- as.character(seu.CCA@meta.data[rownames(tsne.emb),]$sample_id)
ggplot(data = tsne.emb, aes(x=tSNE_1, y=tSNE_2, color=sample_id)) + 
  geom_point(size = .1) +
  facet_wrap(~sample_id, ncol = 3) + 
  theme_bw() + 
  theme(legend.position = "none",
        strip.text = element_text(size = 15, face = "bold"))

Clusters were identifed using the function “FindClusters” from Seurat using default parameters.

聚类

seu.CCA <- FindClusters(seu.CCA, reduction.type = "cca.aligned", dims.use = 1:20, save.SNN = TRUE)

对聚类结果进行评价

The robustness of the clusters was calculated using the function “AssessNodes” from Seurat. For each cluster, a phylogenetic tree based on the distance matrix in gene expression space was computed. Next, Seurat computed an out-of-bag error for a random forest classifer trained on each internal node split of the tree.

构建phylogenetic tree

seu.CCA <- BuildClusterTree(seu.CCA, reorder.numeric = TRUE, do.reorder = TRUE, do.plot = FALSE)
## [1] "Finished averaging RNA for cluster 0"
## [1] "Finished averaging RNA for cluster 1"
## [1] "Finished averaging RNA for cluster 2"
## [1] "Finished averaging RNA for cluster 3"
## [1] "Finished averaging RNA for cluster 4"
## [1] "Finished averaging RNA for cluster 5"
## [1] "Finished averaging RNA for cluster 6"
## [1] "Finished averaging RNA for cluster 7"
## [1] "Finished averaging RNA for cluster 8"
## [1] "Finished averaging RNA for cluster 9"
## [1] "Finished averaging RNA for cluster 10"
## [1] "Finished averaging RNA for cluster 11"
## [1] "Finished averaging RNA for cluster 12"
## [1] "Finished averaging RNA for cluster 13"
## [1] "Finished averaging RNA for cluster 14"
## [1] "Finished averaging RNA for cluster 15"
## [1] "Finished averaging RNA for cluster 16"
## [1] "Finished averaging RNA for cluster 17"
## [1] "Finished averaging RNA for cluster 18"
## [1] "Finished averaging RNA for cluster 1"
## [1] "Finished averaging RNA for cluster 2"
## [1] "Finished averaging RNA for cluster 3"
## [1] "Finished averaging RNA for cluster 4"
## [1] "Finished averaging RNA for cluster 5"
## [1] "Finished averaging RNA for cluster 6"
## [1] "Finished averaging RNA for cluster 7"
## [1] "Finished averaging RNA for cluster 8"
## [1] "Finished averaging RNA for cluster 9"
## [1] "Finished averaging RNA for cluster 10"
## [1] "Finished averaging RNA for cluster 11"
## [1] "Finished averaging RNA for cluster 12"
## [1] "Finished averaging RNA for cluster 13"
## [1] "Finished averaging RNA for cluster 14"
## [1] "Finished averaging RNA for cluster 15"
## [1] "Finished averaging RNA for cluster 16"
## [1] "Finished averaging RNA for cluster 17"
## [1] "Finished averaging RNA for cluster 18"
## [1] "Finished averaging RNA for cluster 19"

计算out-of-bag error,文章认为oobe<0.1的节点分类是可靠的。

oob <- AssessNodes(seu.CCA)
oob
##    node        oobe
## 1    20 0.026977975
## 2    21 0.017106612
## 3    24 0.038723404
## 4    29 0.003200190
## 5    34 0.012274645
## 6    37 0.137931034
## 7    35 0.014610390
## 8    30 0.022845275
## 9    25 0.008376289
## 10   26 0.091981700
## 11   27 0.008272506
## 12   31 0.002969121
## 13   36 0.016689847
## 14   22 0.008639053
## 15   23 0.002000762
## 16   28 0.013710618
## 17   32 0.098403921
## 18   33 0.026643747

可以看到节点37的oobe=0.138>0.1,我们可以将节点37的两个分类合并,从phylogenetic tree上看,就是cluster2和cluster3。

PlotClusterTree(seu.CCA)

tsne.emb$ident <- factor(as.character(seu.CCA@ident), levels = levels(seu.CCA@ident))

ggplot(data = tsne.emb, aes(x=tSNE_1, y=tSNE_2, color=ident)) + 
  geom_point(size = .1) +
  facet_wrap(~sample_id, ncol = 3) + 
  guides(colour = guide_legend(title = "Cluster", 
                               title.hjust=0.5, 
                               keyheight = 1.5, 
                               override.aes = list(size=6))) + 
  theme_bw() + 
  theme(strip.text = element_text(size = 15, face = "bold"))

查看cluster2和cluster3

tsne.emb$is_2_3 <- tsne.emb$ident %in% c(2,3)

ggplot(data = tsne.emb, aes(x=tSNE_1, y=tSNE_2, color=is_2_3)) + 
  geom_point(size = .1) +
  scale_color_manual(values = c("grey","red")) + 
  guides(colour = guide_legend(title = "Cluster 2&3",
                               title.hjust=0.5, 
                               keyheight = 1.5, 
                               override.aes = list(size=6))) + 
  theme_bw() + 
  theme(strip.text = element_text(size = 15, face = "bold"))

合并节点37,重新构建phylogenetic tree

seu.CCA <- MergeNode(seu.CCA, node.use = 37, rebuild.tree = TRUE, 
                     reorder.numeric = TRUE, do.reorder = TRUE, do.plot = FALSE)
## [1] "Finished averaging RNA for cluster 1"
## [1] "Finished averaging RNA for cluster 2"
## [1] "Finished averaging RNA for cluster 4"
## [1] "Finished averaging RNA for cluster 5"
## [1] "Finished averaging RNA for cluster 6"
## [1] "Finished averaging RNA for cluster 7"
## [1] "Finished averaging RNA for cluster 8"
## [1] "Finished averaging RNA for cluster 9"
## [1] "Finished averaging RNA for cluster 10"
## [1] "Finished averaging RNA for cluster 11"
## [1] "Finished averaging RNA for cluster 12"
## [1] "Finished averaging RNA for cluster 13"
## [1] "Finished averaging RNA for cluster 14"
## [1] "Finished averaging RNA for cluster 15"
## [1] "Finished averaging RNA for cluster 16"
## [1] "Finished averaging RNA for cluster 17"
## [1] "Finished averaging RNA for cluster 18"
## [1] "Finished averaging RNA for cluster 19"
## [1] "Finished averaging RNA for cluster 1"
## [1] "Finished averaging RNA for cluster 2"
## [1] "Finished averaging RNA for cluster 3"
## [1] "Finished averaging RNA for cluster 4"
## [1] "Finished averaging RNA for cluster 5"
## [1] "Finished averaging RNA for cluster 6"
## [1] "Finished averaging RNA for cluster 7"
## [1] "Finished averaging RNA for cluster 8"
## [1] "Finished averaging RNA for cluster 9"
## [1] "Finished averaging RNA for cluster 10"
## [1] "Finished averaging RNA for cluster 11"
## [1] "Finished averaging RNA for cluster 12"
## [1] "Finished averaging RNA for cluster 13"
## [1] "Finished averaging RNA for cluster 14"
## [1] "Finished averaging RNA for cluster 15"
## [1] "Finished averaging RNA for cluster 16"
## [1] "Finished averaging RNA for cluster 17"
## [1] "Finished averaging RNA for cluster 18"
plotClusterTree(seu.CCA)

重新计算out-of-bag error

oob <- AssessNodes(seu.CCA)
oob
##    node        oobe
## 1    19 0.027900824
## 2    20 0.030945669
## 3    23 0.004433186
## 4    30 0.014610390
## 5    31 0.022845275
## 6    24 0.012474161
## 7    25 0.002969121
## 8    35 0.016689847
## 9    26 0.004050223
## 10   28 0.091981700
## 11   29 0.008422852
## 12   34 0.010484593
## 13   21 0.008639053
## 14   22 0.002000762
## 15   27 0.013710618
## 16   32 0.098403921
## 17   33 0.026643747

这下所有节点的oobe<0.1,认为此时的聚类是可靠的(我们得到了18个cluster,文章中给出了16个cluster)

tsne.emb$ident <- factor(as.character(seu.CCA@ident), levels = levels(seu.CCA@ident))

ggplot(data = tsne.emb, aes(x=tSNE_1, y=tSNE_2, color=ident)) + 
  geom_point(size = .1) +
  facet_wrap(~sample_id, ncol = 3) + 
  guides(colour = guide_legend(title = "Cluster", 
                               title.hjust=0.5, 
                               keyheight = 1.5, 
                               override.aes = list(size=6))) + 
  theme_bw() + 
  theme(strip.text = element_text(size = 15, face = "bold"))

保存计算结果

saveRDS(oob, "tmp/s3.1-clustering_robustness.rds")
saveRDS(seu.CCA, "tmp/s3.2-seu_CCA_clustering.rds")